import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 5000)
def dame_variables_categoricas(dataset=None):
'''
----------------------------------------------------------------------------------------------------------
Función dame_variables_categoricas:
----------------------------------------------------------------------------------------------------------
-Descripción: Función que recibe un dataset y devuelve una lista con los nombres de las
variables categóricas
-Inputs:
-- dataset: Pandas dataframe que contiene los datos
-Return:
-- lista_variables_categoricas: lista con los nombres de las variables categóricas del
dataset de entrada con menos de 100 valores diferentes
-- 1: la ejecución es incorrecta
'''
if dataset is None:
print(u'\nFaltan argumentos por pasar a la función')
return 1
lista_variables_categoricas = []
other = []
for i in dataset.columns:
if (dataset[i].dtype!=float) & (dataset[i].dtype!=int):
unicos = int(len(np.unique(dataset[i].dropna(axis=0, how='all'))))
if unicos < 100:
lista_variables_categoricas.append(i)
else:
other.append(i)
return lista_variables_categoricas, other
df_fraud = pd.read_csv("../data/Base.csv")
df_fraud.head()
| fraud_bool | income | name_email_similarity | prev_address_months_count | current_address_months_count | customer_age | days_since_request | intended_balcon_amount | payment_type | zip_count_4w | velocity_6h | velocity_24h | velocity_4w | bank_branch_count_8w | date_of_birth_distinct_emails_4w | employment_status | credit_risk_score | email_is_free | housing_status | phone_home_valid | phone_mobile_valid | bank_months_count | has_other_cards | proposed_credit_limit | foreign_request | source | session_length_in_minutes | device_os | keep_alive_session | device_distinct_emails_8w | device_fraud_count | month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0.9 | 0.166828 | -1 | 88 | 50 | 0.020925 | -1.331345 | AA | 769 | 10650.765523 | 3134.319630 | 3863.647740 | 1 | 6 | CA | 185 | 0 | BA | 1 | 0 | 24 | 0 | 500.0 | 0 | INTERNET | 3.888115 | windows | 0 | 1 | 0 | 7 |
| 1 | 1 | 0.9 | 0.296286 | -1 | 144 | 50 | 0.005418 | -0.816224 | AB | 366 | 534.047319 | 2670.918292 | 3124.298166 | 718 | 3 | CA | 259 | 1 | BA | 0 | 0 | 15 | 0 | 1500.0 | 0 | INTERNET | 31.798819 | windows | 0 | 1 | 0 | 7 |
| 2 | 1 | 0.9 | 0.044985 | -1 | 132 | 40 | 3.108549 | -0.755728 | AC | 870 | 4048.534263 | 2893.621498 | 3159.590679 | 1 | 14 | CB | 177 | 1 | BA | 0 | 1 | -1 | 0 | 200.0 | 0 | INTERNET | 4.728705 | other | 0 | 1 | 0 | 7 |
| 3 | 1 | 0.9 | 0.159511 | -1 | 22 | 50 | 0.019079 | -1.205124 | AB | 810 | 3457.064063 | 4054.908412 | 3022.261812 | 1921 | 6 | CA | 110 | 1 | BA | 0 | 1 | 31 | 1 | 200.0 | 0 | INTERNET | 2.047904 | linux | 0 | 1 | 0 | 7 |
| 4 | 1 | 0.9 | 0.596414 | -1 | 218 | 50 | 0.004441 | -0.773276 | AB | 890 | 5020.341679 | 2728.237159 | 3087.670952 | 1990 | 2 | CA | 295 | 1 | BA | 1 | 0 | 31 | 0 | 1500.0 | 0 | INTERNET | 3.775225 | macintosh | 1 | 1 | 0 | 7 |
Objetivos del trabajo
Queremos hacer un análisis de la posibilidad de fraude de los clientes que se han abierto una cuenta en el banco en los últimos ocho meses. Este es uno de los grandes análisis que deben hacer las entidades financieras y es lo que debemos resolver.
from sklearn.model_selection import train_test_split
X = df_fraud.drop(columns=['fraud_bool'], inplace=False)
y = df_fraud['fraud_bool']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=69)
from sklearn.model_selection import train_test_split
df_fraud_train, df_fraud_test = train_test_split(df_fraud, test_size=0.2, random_state=69)
dimension=df_fraud.shape, df_fraud.drop_duplicates().shape
dimension
((1000000, 32), (1000000, 32))
dimension_train=X_train.shape, X_train.drop_duplicates().shape
dimension_train
((800000, 31), (800000, 31))
X_train.dtypes
income float64 name_email_similarity float64 prev_address_months_count int64 current_address_months_count int64 customer_age int64 days_since_request float64 intended_balcon_amount float64 payment_type object zip_count_4w int64 velocity_6h float64 velocity_24h float64 velocity_4w float64 bank_branch_count_8w int64 date_of_birth_distinct_emails_4w int64 employment_status object credit_risk_score int64 email_is_free int64 housing_status object phone_home_valid int64 phone_mobile_valid int64 bank_months_count int64 has_other_cards int64 proposed_credit_limit float64 foreign_request int64 source object session_length_in_minutes float64 device_os object keep_alive_session int64 device_distinct_emails_8w int64 device_fraud_count int64 month int64 dtype: object
df_fraud_fraud_bool = df_fraud['fraud_bool']\
.value_counts(normalize=True)\
.mul(100).rename('percent').reset_index()
df_fraud_fraud_bool_conteo = df_fraud['fraud_bool'].value_counts().reset_index()
df_fraud_fraud_bool_pc = pd.merge(df_fraud_fraud_bool, df_fraud_fraud_bool_conteo, on=['index'], how='inner')
df_fraud_fraud_bool_pc
| index | percent | fraud_bool | |
|---|---|---|---|
| 0 | 0 | 98.8971 | 988971 |
| 1 | 1 | 1.1029 | 11029 |
fig = px.histogram(df_fraud_fraud_bool_pc, x="index", y=['percent'])
fig.show()
Gracias al gráfico y a la tabla anterior, observamos que el porcentaje de fraude dentro de nuestros datos es del 1,1%.
columns_with_special_missing_values = {
'prev_address_months_count': -1,
'current_address_months_count': -1,
'intended_balcon_amount': 0, # Ahora usamos 0 para representar valores menores a 0
'bank_months_count': -1,
'session_length_in_minutes': -1,
'device_distinct_emails_8w': -1
}
# Sustituir valores nulos (representados con -1 o con valores negativos)
for col, special_value in columns_with_special_missing_values.items():
if special_value is not None:
if col == 'intended_balcon_amount':
df_fraud[col] = df_fraud[col].apply(lambda x: x if x >= special_value else pd.NA)
else:
df_fraud[col] = df_fraud[col].apply(lambda x: x if x != special_value else pd.NA)
pd_series_null_columns = df_fraud.isnull().sum().sort_values(ascending=False)
pd_series_null_rows = df_fraud.isnull().sum(axis=1).sort_values(ascending=False)
print(pd_series_null_columns.shape, pd_series_null_rows.shape)
pd_null_columnas = pd.DataFrame(pd_series_null_columns, columns=['nulos_columnas'])
pd_null_filas = pd.DataFrame(pd_series_null_rows, columns=['nulos_filas'])
pd_null_filas['target'] = df_fraud['fraud_bool'].copy()
pd_null_columnas['porcentaje_columnas'] = pd_null_columnas['nulos_columnas']/df_fraud.shape[0]
pd_null_filas['porcentaje_filas']= pd_null_filas['nulos_filas']/df_fraud.shape[1]
(32,) (1000000,)
df_fraud.shape
(1000000, 32)
pd_null_columnas
| nulos_columnas | porcentaje_columnas | |
|---|---|---|
| intended_balcon_amount | 742523 | 0.742523 |
| prev_address_months_count | 712920 | 0.712920 |
| bank_months_count | 253635 | 0.253635 |
| current_address_months_count | 4254 | 0.004254 |
| session_length_in_minutes | 2015 | 0.002015 |
| device_distinct_emails_8w | 359 | 0.000359 |
| fraud_bool | 0 | 0.000000 |
| foreign_request | 0 | 0.000000 |
| phone_mobile_valid | 0 | 0.000000 |
| has_other_cards | 0 | 0.000000 |
| proposed_credit_limit | 0 | 0.000000 |
| device_os | 0 | 0.000000 |
| source | 0 | 0.000000 |
| housing_status | 0 | 0.000000 |
| keep_alive_session | 0 | 0.000000 |
| device_fraud_count | 0 | 0.000000 |
| phone_home_valid | 0 | 0.000000 |
| credit_risk_score | 0 | 0.000000 |
| email_is_free | 0 | 0.000000 |
| income | 0 | 0.000000 |
| employment_status | 0 | 0.000000 |
| date_of_birth_distinct_emails_4w | 0 | 0.000000 |
| bank_branch_count_8w | 0 | 0.000000 |
| velocity_4w | 0 | 0.000000 |
| velocity_24h | 0 | 0.000000 |
| velocity_6h | 0 | 0.000000 |
| zip_count_4w | 0 | 0.000000 |
| payment_type | 0 | 0.000000 |
| days_since_request | 0 | 0.000000 |
| customer_age | 0 | 0.000000 |
| name_email_similarity | 0 | 0.000000 |
| month | 0 | 0.000000 |
threshold=0.9
list_vars_not_null = list(pd_null_columnas[pd_null_columnas['porcentaje_columnas']<threshold].index)
df_fraud_filter_null = df_fraud.loc[:, list_vars_not_null]
df_fraud_filter_null.shape
(1000000, 32)
pd_null_filas
| nulos_filas | target | porcentaje_filas | |
|---|---|---|---|
| 415911 | 5 | 0 | 0.15625 |
| 539043 | 5 | 0 | 0.15625 |
| 315076 | 5 | 0 | 0.15625 |
| 980028 | 5 | 0 | 0.15625 |
| 281065 | 5 | 0 | 0.15625 |
| ... | ... | ... | ... |
| 285543 | 0 | 0 | 0.00000 |
| 285550 | 0 | 0 | 0.00000 |
| 285551 | 0 | 0 | 0.00000 |
| 878673 | 0 | 0 | 0.00000 |
| 500000 | 0 | 0 | 0.00000 |
1000000 rows × 3 columns
list_cat_vars, other = dame_variables_categoricas(dataset=df_fraud_filter_null)
df_fraud_filter_null[list_cat_vars] = df_fraud_filter_null[list_cat_vars].astype("category")
df_fraud_filter_null[list_cat_vars].head()
| bank_months_count | device_distinct_emails_8w | fraud_bool | foreign_request | phone_mobile_valid | has_other_cards | device_os | source | housing_status | keep_alive_session | device_fraud_count | phone_home_valid | email_is_free | employment_status | date_of_birth_distinct_emails_4w | payment_type | customer_age | month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 24 | 1 | 1 | 0 | 0 | 0 | windows | INTERNET | BA | 0 | 0 | 1 | 0 | CA | 6 | AA | 50 | 7 |
| 1 | 15 | 1 | 1 | 0 | 0 | 0 | windows | INTERNET | BA | 0 | 0 | 0 | 1 | CA | 3 | AB | 50 | 7 |
| 2 | NaN | 1 | 1 | 0 | 1 | 0 | other | INTERNET | BA | 0 | 0 | 0 | 1 | CB | 14 | AC | 40 | 7 |
| 3 | 31 | 1 | 1 | 0 | 1 | 1 | linux | INTERNET | BA | 0 | 0 | 0 | 1 | CA | 6 | AB | 50 | 7 |
| 4 | 31 | 1 | 1 | 0 | 0 | 0 | macintosh | INTERNET | BA | 1 | 0 | 1 | 1 | CA | 2 | AB | 50 | 7 |
list_cat_vars
['bank_months_count', 'device_distinct_emails_8w', 'fraud_bool', 'foreign_request', 'phone_mobile_valid', 'has_other_cards', 'device_os', 'source', 'housing_status', 'keep_alive_session', 'device_fraud_count', 'phone_home_valid', 'email_is_free', 'employment_status', 'date_of_birth_distinct_emails_4w', 'payment_type', 'customer_age', 'month']
df_fraud_filter_null['employment_status'].value_counts()
CA 730252 CB 138288 CF 44034 CC 37758 CD 26522 CE 22693 CG 453 Name: employment_status, dtype: int64
Tal y como podemos observar, la mayoría de nulos se encuentran principalmente en los "employment status" CA y CB.
df_fraud.to_csv("../data/df_fraud_na.csv")